"""
Findings repository for managing research findings.
"""

from loguru import logger
from typing import Dict, List, Union

from langchain_core.documents import Document
from langchain_core.language_models import BaseLLM

from ...utilities.search_utilities import format_findings
from .base_findings import BaseFindingsRepository


def format_links(links: List[Dict]) -> str:
    """Format a list of links into a readable string.

    Args:
        links: List of dictionaries containing 'title' and 'url' keys

    Returns:
        str: Formatted string of links, one numbered entry per link
    """
    lines = []
    for position, link in enumerate(links, start=1):
        lines.append(f"{position}. {link['title']}\n   URL: {link['url']}")
    return "\n".join(lines)


class FindingsRepository(BaseFindingsRepository):
    """Repository for managing research findings.

    Stores per-query findings, collects source documents, tracks the
    questions generated in each research iteration, and synthesizes the
    accumulated knowledge into a final answer via the configured LLM.
    """

    def __init__(self, model: BaseLLM):
        """Initialize the repository.

        Args:
            model: The LLM model to use for synthesis
        """
        super().__init__(model)
        # Findings keyed by query string; each value is a list of finding dicts.
        self.findings: Dict[str, List[Dict]] = {}
        # All source documents added so far.
        self.documents: List[Document] = []
        # Iteration number -> list of questions asked in that iteration.
        self.questions_by_iteration: Dict[int, List[str]] = {}

    @staticmethod
    def _extract_finding_texts(findings: List[Union[Dict, str]]) -> List[str]:
        """Return the textual content of each finding.

        Dict findings contribute their 'content' value; string findings are
        used as-is; any other item type is silently skipped.

        Args:
            findings: Mixed list of finding dicts and/or raw strings.

        Returns:
            List of content strings in the original order.
        """
        texts: List[str] = []
        for item in findings:
            if isinstance(item, dict) and "content" in item:
                texts.append(item["content"])
            elif isinstance(item, str):
                texts.append(item)
        return texts

    def add_finding(self, query: str, finding: Dict | str) -> None:
        """Add a finding for a query.

        A string finding is wrapped in a standard finding dict. A dict
        finding is stored as-is; when its phase is "Final synthesis", a raw
        copy of the content is additionally stored under the
        "<query>_synthesis" key.

        Args:
            query: The query the finding belongs to.
            finding: Finding dict or raw content string.
        """
        self.findings.setdefault(query, [])

        # Convert to dictionary if it's a string
        if isinstance(finding, str):
            finding_dict = {
                "phase": "Synthesis",
                "content": finding,
                "question": query,
                "search_results": [],
                "documents": [],
            }
            self.findings[query].append(finding_dict)
        else:
            # It's already a dictionary
            self.findings[query].append(finding)

            # Store raw synthesized content if it's the final synthesis.
            # Keep the isinstance guard: a caller may pass something that is
            # neither str nor dict, and .get would fail on it.
            if (
                isinstance(finding, dict)
                and finding.get("phase") == "Final synthesis"
            ):
                self.findings[query + "_synthesis"] = [
                    {
                        "phase": "Synthesis",
                        "content": finding.get("content", ""),
                        "question": query,
                        "search_results": [],
                        "documents": [],
                    }
                ]

        logger.info(
            f"Added finding for query: {query}. Total findings: {len(self.findings[query])}"
        )

    def get_findings(self, query: str) -> List[Dict]:
        """Get findings for a query.

        Args:
            query: The query to get findings for

        Returns:
            List of findings for the query (empty list if none recorded).
        """
        return self.findings.get(query, [])

    def clear_findings(self, query: str) -> None:
        """Clear findings for a query.

        Note: does not remove any "<query>_synthesis" entry created by
        add_finding.

        Args:
            query: The query to clear findings for
        """
        if query in self.findings:
            del self.findings[query]
            logger.info(f"Cleared findings for query: {query}")

    def add_documents(self, documents: List[Document]) -> None:
        """Add documents to the repository.

        Args:
            documents: List of documents to add
        """
        self.documents.extend(documents)
        logger.info(f"Added {len(documents)} documents to repository")

    def set_questions_by_iteration(
        self, questions_by_iteration: Dict[int, List[str]]
    ) -> None:
        """Set the questions by iteration.

        A shallow copy is stored so later mutation of the caller's dict does
        not affect the repository (the per-iteration lists are still shared).

        Args:
            questions_by_iteration: Dictionary mapping iteration numbers to lists of questions
        """
        self.questions_by_iteration = questions_by_iteration.copy()
        logger.info(
            f"Set questions for {len(questions_by_iteration)} iterations"
        )

    def format_findings_to_text(
        self, findings_list: List[Dict], synthesized_content: str
    ) -> str:
        """Format findings into a detailed text output using the utility function.

        Args:
            findings_list: List of finding dictionaries from the strategy execution.
            synthesized_content: The final synthesized content generated by the LLM.

        Returns:
            str: Formatted text output, or a fallback containing the raw
            synthesized content if formatting raises.
        """
        logger.info(
            f"Formatting final report. Number of detailed findings: {len(findings_list)}. Synthesized content length: {len(synthesized_content)}. Number of question iterations: {len(self.questions_by_iteration)}"
        )
        # Log details about the inputs
        logger.debug(
            f"Detailed findings list structure (first item type if exists): {type(findings_list[0]) if findings_list else 'Empty'}"
        )
        logger.debug(
            f"Questions by iteration keys: {list(self.questions_by_iteration.keys())}"
        )
        if findings_list:
            logger.debug(
                f"First finding item keys: {list(findings_list[0].keys())}"
            )

        try:
            # Pass the detailed findings list, the synthesized content (as current_knowledge), and the stored questions
            formatted_report = format_findings(
                findings_list,
                synthesized_content,  # This goes to the 'current_knowledge' param in format_findings
                self.questions_by_iteration,
            )
            logger.info("Successfully formatted final report.")
            return formatted_report
        except Exception as e:
            logger.exception(
                f"Error occurred during final report formatting: {e!s}"
            )
            # Fallback: return just the synthesized content if formatting fails
            return f"Error during final formatting. Raw Synthesized Content:\n\n{synthesized_content}"

    def synthesize_findings(
        self,
        query: str,
        sub_queries: List[str],
        findings: List[Union[Dict, str]],
        accumulated_knowledge: str | None = None,
        old_formatting: bool = False,
    ) -> str:
        """
        Synthesize accumulated knowledge into a final answer.

        Args:
            query: The original query
            sub_queries: List of sub-queries (for context)
            findings: List of findings strings or dictionaries from previous steps
            accumulated_knowledge: Optional pre-existing knowledge to incorporate
            old_formatting: Whether to use the old formatting approach

        Returns:
            str: Synthesized final answer content, or an "Error: ..." string
            describing why synthesis failed.
        """
        logger.info(f"synthesize_findings called with query: '{query}'")
        logger.info(
            f"sub_queries type: {type(sub_queries)}, length: {len(sub_queries)}"
        )
        logger.info(f"findings type: {type(findings)}, length: {len(findings)}")

        # Use provided accumulated_knowledge or join findings if it's None
        if accumulated_knowledge is None:
            accumulated_knowledge = "\n\n".join(
                self._extract_finding_texts(findings)
            )

        if findings:
            logger.info(f"first finding type: {type(findings[0])}")
            if isinstance(findings[0], dict):
                logger.info(
                    f"first finding keys: {list(findings[0].keys()) if hasattr(findings[0], 'keys') else 'No keys'}"
                )
                if "content" in findings[0]:
                    logger.info(
                        f"first finding content type: {type(findings[0]['content'])}"
                    )
            elif isinstance(findings[0], str):
                logger.info(f"first finding string length: {len(findings[0])}")
                logger.info(
                    f"first finding string preview: {findings[0][:100]}..."
                )

        if old_formatting:
            # Convert findings list if it contains strings instead of dictionaries
            findings_list = []
            for i, item in enumerate(findings):
                if isinstance(item, str):
                    findings_list.append(
                        {"phase": f"Finding {i + 1}", "content": item}
                    )
                elif isinstance(item, dict):
                    findings_list.append(item)

            # BUGFIX: call positionally, matching format_findings_to_text.
            # format_findings' second parameter is named 'current_knowledge'
            # (see the comment in format_findings_to_text), so the previous
            # keyword call with synthesized_content= raised a TypeError.
            return format_findings(
                findings_list,
                accumulated_knowledge,
                self.questions_by_iteration,
            )
        try:
            # Extract finding content texts for the prompt
            # ("" when there are no findings, since join of [] is "").
            current_knowledge = "\n\n".join(
                self._extract_finding_texts(findings)
            )

            # Check if knowledge exceeds a reasonable token limit (rough
            # estimate based on characters: 1 token ≈ 4 characters in English).
            estimated_tokens = len(current_knowledge) / 4
            max_safe_tokens = (
                12000  # Adjust based on your model's context window
            )

            if estimated_tokens > max_safe_tokens:
                logger.warning(
                    f"Knowledge size may exceed model's capacity: ~{int(estimated_tokens)} tokens"
                )
                # Truncate, keeping the beginning and end which are often most
                # important. This is a simple approach - a more sophisticated
                # chunking might be better. (Anything that trips the warning
                # above is > 48000 chars, so this inner check always holds.)
                if len(current_knowledge) > 24000:  # ~6000 tokens
                    first_part = current_knowledge[
                        :12000
                    ]  # ~3000 tokens from start
                    last_part = current_knowledge[
                        -12000:
                    ]  # ~3000 tokens from end
                    current_knowledge = f"{first_part}\n\n[...content truncated due to length...]\n\n{last_part}"
                    logger.info(
                        "Knowledge truncated to fit within token limits"
                    )

            prompt = f"""Use IEEE style citations [1], [2], etc. Never make up your own citations. Synthesize the following accumulated knowledge into a comprehensive answer for the original query.
Format the response with clear sections, citations, and a concise summary.

Original Query: {query}

Accumulated Knowledge:
{current_knowledge}

Sub-questions asked (for context):
{chr(10).join(f"- {sq}" for sq in sub_queries)}

Generate a well-structured, concise answer that:
1. Starts with a clear explanation of the most important points
2. Organizes information into logical sections with headers if needed
3. Maintains logical flow and prioritizes important information over minor details
4. Avoids repetition and unnecessary detail

Use IEEE style citations [1], [2], etc. Never make up your own citations.
"""

            logger.info(
                f"Synthesizing final answer. Query: '{query}'. Knowledge length: {len(current_knowledge)}. Prompt length: {len(prompt)}"
            )
            # Log first 500 chars of prompt for debugging context length issues
            logger.debug(
                f"Synthesis prompt (first 500 chars): {prompt[:500]}..."
            )

            try:
                # Add timeout handling
                import platform
                import signal
                import threading
                from contextlib import contextmanager

                def extract_content(response) -> str:
                    # Handle different response types: chat-style objects
                    # expose .content; plain strings are used as-is.
                    if hasattr(response, "content"):
                        return response.content
                    return str(response)

                # Windows has no SIGALRM, so use a thread-based timeout there.
                if platform.system() == "Windows":

                    def invoke_with_timeout(
                        timeout_seconds, func, *args, **kwargs
                    ):
                        """Run func in a daemon thread; raise TimeoutError if it
                        does not finish within timeout_seconds.

                        Note: the worker thread cannot be killed; it is left
                        running as a daemon if it exceeds the timeout.
                        """
                        result = None
                        exception = None
                        completed = False

                        def target():
                            nonlocal result, exception, completed
                            try:
                                result = func(*args, **kwargs)
                                completed = True
                            except Exception as e:
                                exception = e

                        thread = threading.Thread(target=target)
                        thread.daemon = True
                        thread.start()
                        thread.join(timeout_seconds)
                        if not completed and thread.is_alive():
                            raise TimeoutError(
                                f"Operation timed out after {timeout_seconds} seconds"
                            )
                        if exception:
                            raise exception
                        return result

                    # Use Windows-compatible timeout
                    try:
                        logger.info(
                            "Using Windows-compatible timeout for LLM invocation"
                        )
                        response = invoke_with_timeout(
                            120, self.model.invoke, prompt
                        )
                        synthesized_content = extract_content(response)

                        logger.info(
                            f"Successfully synthesized final answer for query: '{query}'"
                        )
                        # Return only the synthesized content from the LLM
                        return synthesized_content
                    except TimeoutError as timeout_error:
                        logger.exception(
                            f"LLM invocation timed out during synthesis for query '{query}': {timeout_error}"
                        )
                        # Return more specific error about timeout
                        return "Error: Final answer synthesis failed due to LLM timeout. Please check your LLM service or try with a smaller query scope."

                else:
                    # Unix-compatible timeout using SIGALRM.
                    # NOTE(review): signal.alarm/SIGALRM only work in the main
                    # thread of the main interpreter — confirm synthesis is
                    # never invoked from a worker thread on Unix.
                    @contextmanager
                    def timeout(seconds, message="Operation timed out"):
                        def signal_handler(signum, frame):
                            raise TimeoutError(message)

                        signal.signal(signal.SIGALRM, signal_handler)
                        signal.alarm(seconds)
                        try:
                            yield
                        finally:
                            # Always cancel the pending alarm.
                            signal.alarm(0)

                    # Try with a timeout (adjust seconds as needed)
                    try:
                        with timeout(
                            120, "LLM invocation timed out after 120 seconds"
                        ):
                            response = self.model.invoke(prompt)
                            synthesized_content = extract_content(response)

                            logger.info(
                                f"Successfully synthesized final answer for query: '{query}'"
                            )
                            # Return only the synthesized content from the LLM
                            return synthesized_content
                    except TimeoutError as timeout_error:
                        logger.exception(
                            f"LLM invocation timed out during synthesis for query '{query}': {timeout_error}"
                        )
                        # Return more specific error about timeout
                        return "Error: Final answer synthesis failed due to LLM timeout. Please check your LLM service or try with a smaller query scope."

            except Exception as invoke_error:
                logger.exception(
                    f"LLM invocation failed during synthesis for query '{query}': {invoke_error}"
                )

                # Classify the failure by substrings of the error message and
                # return a matching user-facing explanation; first match wins.
                error_message = str(invoke_error).lower()
                error_responses = [
                    (
                        ("timeout", "timed out"),
                        "Error: Failed to synthesize final answer due to LLM timeout. Please check your connection or try again later.",
                    ),
                    (
                        ("too many tokens", "context length", "token limit"),
                        "Error: Failed to synthesize final answer due to token limit exceeded. Try reducing the scope of your query.",
                    ),
                    (
                        ("rate limit", "rate_limit"),
                        "Error: Failed to synthesize final answer due to LLM rate limit. Please try again in a few minutes.",
                    ),
                    (
                        ("connection", "network"),
                        "Error: Failed to synthesize final answer due to connection issues. Please check your internet connection and LLM service status.",
                    ),
                    (
                        ("api key", "authentication"),
                        "Error: Failed to synthesize final answer due to authentication issues. Please check your API keys.",
                    ),
                ]
                for needles, response_text in error_responses:
                    if any(needle in error_message for needle in needles):
                        return response_text

                # Generic error with details
                return f"Error: Failed to synthesize final answer. LLM error: {invoke_error!s}"

        except Exception as e:
            # Catch potential errors during prompt construction or logging itself
            logger.exception(
                f"Error preparing or executing synthesis for query '{query}': {e!s}"
            )
            # Return a specific error message for synthesis failure
            return f"Error: Failed to synthesize final answer from knowledge. Details: {e!s}"
